Data Understanding¶

In [1]:
from pathlib import Path

import matplotlib.pyplot as plt
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import seaborn as sns
In [2]:
# Load the raw movie-rating table; the path is relative, so it resolves against the current working directory
df = pd.read_csv('./data/Movie Rating Dataset.csv')
In [3]:
# Narrow the frame down to the predictors and the target used in this notebook
print(f'cols before: {df.columns}')
relevant_columns = [
    'Title', 'Genre', 'Tags', 'Languages', 'Series or Movie', 'Runtime',
    'Director', 'Writer', 'Actors', 'Release Date', 'Summary', 'IMDb Score',
]
df = df[relevant_columns]
print(f'After changing columns: {df.columns}')
cols before: Index(['Title', 'Genre', 'Tags', 'Languages', 'Series or Movie',
       'Hidden Gem Score', 'Country Availability', 'Runtime', 'Director',
       'Writer', 'Actors', 'View Rating', 'IMDb Score',
       'Rotten Tomatoes Score', 'Metacritic Score', 'Awards Received',
       'Awards Nominated For', 'Boxoffice', 'Release Date',
       'Netflix Release Date', 'Production House', 'Netflix Link', 'IMDb Link',
       'Summary', 'IMDb Votes', 'Image', 'Poster', 'TMDb Trailer',
       'Trailer Site'],
      dtype='object')
After changing columns: Index(['Title', 'Genre', 'Tags', 'Languages', 'Series or Movie', 'Runtime',
       'Director', 'Writer', 'Actors', 'Release Date', 'Summary',
       'IMDb Score'],
      dtype='object')

Deleting instances where target variable is null¶

In [4]:
# Row count before cleaning
print(f'How many instances: {df.shape[0]}')

# Number of rows with a missing target value.
# The f-string is double-quoted so the single-quoted column key inside the
# braces also parses on Python versions older than 3.12.
print(f"How many nulls in IMDb Score col: {df['IMDb Score'].isnull().sum()}")

# Rows without a target cannot be used for the analysis, so remove them
df.dropna(subset=['IMDb Score'], inplace=True)
# Row count once the null targets are gone
print(f'How many instances after dropping nulls in IMDb Score col: {df.shape[0]}')
How many instances: 15480
How many nulls in IMDb Score col: 2099
How many instances after dropping nulls in IMDb Score col: 13381

Target Variable¶

In [5]:
# Histogram (with KDE overlay) of the target variable, also written to ./results.
# savefig does not create missing folders, so make sure the output folder exists first.
Path('./results').mkdir(parents=True, exist_ok=True)

plt.figure(figsize=(10, 6))
# bins=87 presumably mirrors the number of distinct score values — TODO confirm
sns.histplot(data=df, x="IMDb Score", stat='probability', bins=87, kde=True)
plt.title("IMDb Score's Distribution")
plt.savefig('./results/IMDb Score Distribution.png', facecolor='white', edgecolor='white')
plt.show()
In [6]:
# Distinct target values, listed from lowest to highest
sorted_unique_scores = list(df['IMDb Score'].unique())
sorted_unique_scores.sort()
print(sorted_unique_scores)
[1.0, 1.4, 1.5, 1.6, 1.7, 1.9, 2.0, 2.1, 2.2, 2.3, 2.4, 2.5, 2.6, 2.7, 2.8, 2.9, 3.0, 3.1, 3.2, 3.3, 3.4, 3.5, 3.6, 3.7, 3.8, 3.9, 4.0, 4.1, 4.2, 4.3, 4.4, 4.5, 4.6, 4.7, 4.8, 4.9, 5.0, 5.1, 5.2, 5.3, 5.4, 5.5, 5.6, 5.7, 5.8, 5.9, 6.0, 6.1, 6.2, 6.3, 6.4, 6.5, 6.6, 6.7, 6.8, 6.9, 7.0, 7.1, 7.2, 7.3, 7.4, 7.5, 7.6, 7.7, 7.8, 7.9, 8.0, 8.1, 8.2, 8.3, 8.4, 8.5, 8.6, 8.7, 8.8, 8.9, 9.0, 9.1, 9.2, 9.3, 9.4, 9.5, 9.7]
In [7]:
# Descriptive statistics of the target (mean, median, mode, quartiles, spread),
# rendered as a Plotly table
_scores = df['IMDb Score']
_modes = _scores.mode()

mean = round(_scores.mean(), 3)
median = _scores.median()
# first mode when several values tie; NaN when no mode is available
mode = float('nan') if _modes.empty else _modes.values[0]
q1 = _scores.quantile(0.25)
q2 = _scores.quantile(0.75)  # despite its name this is the 75th percentile, displayed as 'Q3'
variance = round(_scores.var(), 3)
standard_deviation = round(_scores.std(), 3)

# Two-column frame that backs the table
descriptive_stats = pd.DataFrame({
    'Statistic': ['Mean', 'Median', 'Mode', 'Q1', 'Q3', 'Variance', 'Standard deviation'],
    'Value': [mean, median, mode, q1, q2, variance, standard_deviation],
})

# Build the table figure
fig = go.Figure(data=[go.Table(
    header={
        'values': list(descriptive_stats.columns),
        'fill_color': '#636EFA',
        'align': 'left',
        'font': {'color': 'black', 'size': 15},
    },
    cells={
        'values': [descriptive_stats['Statistic'], descriptive_stats['Value']],
        'fill_color': 'lavender',
        'align': 'left',
        'height': 25,
        'font': {'color': 'black', 'size': 14},
    },
)])
fig.update_layout(
    template='plotly_white',
    width=500,
    title={
        'text': 'Descriptive Statistics - IMDB Score (Target)',
        'y': 0.85,
        'x': 0.5,
        'xanchor': 'center',
        'font': {'color': 'black', 'weight': 'bold'},
    },
)

# Default file name offered by the "download plot" button
config = {'toImageButtonOptions': {'filename': 'Descriptive Statistics - Rating'}}

fig.show(config=config)
In [8]:
# Rows that share Title, Release Date and Series-or-Movie are treated as duplicates
dedup_keys = ['Title', 'Release Date', 'Series or Movie']
duplicate_mask = df.duplicated(subset=dedup_keys, keep='first')
duplicates = df[duplicate_mask]
num_duplicates = len(duplicates)
print(f'Number of duplicate rows identified: {num_duplicates}')

# Keep the first occurrence of every key and renumber the index afterwards
df.drop_duplicates(subset=dedup_keys, keep='first', inplace=True)
df.reset_index(drop=True, inplace=True)

data_len_after = len(df)
print(f'Total number of rows after dropping duplicates: {data_len_after}')
Number of duplicate rows identified: 82
Total number of rows after dropping duplicates: 13299
In [9]:
# Sanity check: no key (Title, Release Date, Series or Movie) may appear twice any more
has_duplicates = df.duplicated(
    subset=['Title', 'Release Date', 'Series or Movie'],
    keep='first',
).any()

print('there are duplicates' if has_duplicates else 'there are no duplicates')
there are no duplicates

Describe Predictors¶

In [10]:
# Show the column names currently present in the working frame
df.columns
Out[10]:
Index(['Title', 'Genre', 'Tags', 'Languages', 'Series or Movie', 'Runtime',
       'Director', 'Writer', 'Actors', 'Release Date', 'Summary',
       'IMDb Score'],
      dtype='object')
In [11]:
# Per-column overview: number of distinct values and share of missing values (in %)
total_rows = len(df)

# one record per column of the frame
column_info = [
    {
        'Column Name': col,
        'Unique Values': df[col].nunique(),
        'Null Percent': round((df[col].isnull().sum() / total_rows) * 100, 2),
    }
    for col in df.columns
]

column_info_df = pd.DataFrame(column_info)


# Render the overview as a Plotly table
fig = go.Figure(data=[go.Table(
    header={
        'values': list(column_info_df.columns),
        'fill_color': '#636EFA',
        'align': 'left',
        'font': {'color': 'black', 'size': 15},
    },
    cells={
        'values': [
            column_info_df['Column Name'],
            column_info_df['Unique Values'],
            column_info_df['Null Percent'],
        ],
        'fill_color': 'lavender',
        'align': 'left',
        'height': 25,
        'font': {'color': 'black', 'size': 13},
    },
)])


fig.update_layout(
    template='plotly_white',
    width=550,
    height=550,
    title={
        'text': 'Predictors - Null Values',
        'y': 0.85,
        'x': 0.5,
        'xanchor': 'center',
        'font': {'color': 'black', 'weight': 'bold'},
    },
)
fig.show()
In [12]:
# Bar chart with the number of missing values in every column, also written to ./results
missing_values = df.isnull().sum()


fig, ax = plt.subplots(figsize=(14, 8))
# force a white canvas so the saved image looks the same regardless of the active theme
fig.patch.set_facecolor('white')
ax.set_facecolor('white')

# bar plot
missing_values.plot(kind='bar', color='skyblue', ax=ax)
ax.set_title('Count of Missing Values in Each Column')
ax.set_xlabel('Columns')
ax.set_ylabel('Number of Missing Values')
ax.set_xticks(range(len(missing_values.index)))
ax.set_xticklabels(missing_values.index, rotation=45)

ax.grid(axis='y', linestyle='--', alpha=0.7)


# savefig does not create missing folders, so make sure the output folder exists first
Path('./results').mkdir(parents=True, exist_ok=True)
plt.savefig('./results/Missing Values Distribution.png', bbox_inches='tight', facecolor=fig.get_facecolor())
plt.show()

Release Date¶

Extract the day, month and year from the release date, and plot the average rating against each of them independently.

In [13]:
# Parse the release date so its calendar components can be extracted
df['Release Date'] = pd.to_datetime(df['Release Date'])

# Split 'Release Date' into day, month and year columns.
# The f-strings are double-quoted so the single-quoted column keys inside the
# braces also parse on Python versions older than 3.12.
df['released_day'] = df['Release Date'].dt.day
print(f"head day: {df['released_day'].head()}")

df['released_month'] = df['Release Date'].dt.month
print(f"head month: {df['released_month'].head()}")

df['released_year'] = df['Release Date'].dt.year
print(f"head year: {df['released_year'].head()}")

# The extracted parts print as float64 above; cast them to the nullable
# integer dtype so they are integers while missing dates stay missing
df['released_day'] = df['released_day'].astype('Int64')
df['released_month'] = df['released_month'].astype('Int64')
df['released_year'] = df['released_year'].astype('Int64')

print(df['released_day'].dtype)
head day: 0    12.0
1     8.0
2    28.0
3     1.0
4    22.0
Name: released_day, dtype: float64
head month: 0    12.0
1     5.0
2     8.0
3    10.0
4     9.0
Name: released_month, dtype: float64
head year: 0    2008.0
1    2020.0
2    2020.0
3    2016.0
4    2011.0
Name: released_year, dtype: float64
Int64
In [14]:
# Check how old the data is: list the release years and their range.
# Double-quoted f-strings keep the single-quoted column keys inside the braces
# valid on Python versions older than 3.12; the printed text is unchanged.
print(f"unique sorter years: \n {sorted(df['released_year'].dropna().unique())}")
print(f"Minimum year: {df['released_year'].min()}")
print(f"Maximum year: {df['released_year'].max()}")
unique sorter years: 
 [1910, 1913, 1915, 1916, 1918, 1920, 1921, 1923, 1924, 1925, 1927, 1928, 1929, 1930, 1931, 1932, 1934, 1936, 1937, 1938, 1939, 1940, 1941, 1942, 1943, 1946, 1947, 1948, 1949, 1950, 1951, 1952, 1953, 1954, 1955, 1956, 1957, 1958, 1959, 1960, 1961, 1962, 1963, 1964, 1965, 1966, 1967, 1968, 1969, 1970, 1971, 1972, 1973, 1974, 1975, 1976, 1977, 1978, 1979, 1980, 1981, 1982, 1983, 1984, 1985, 1986, 1987, 1988, 1989, 1990, 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020, 2021]
Minimum year: 1910
Maximum year: 2021
In [15]:
# Mean IMDb score per release month, drawn as a bar chart
monthly_data = (
    df.groupby('released_month')['IMDb Score']
    .mean()
    .reset_index()
)
fig = px.bar(
    monthly_data,
    x='released_month',
    y='IMDb Score',
    text='IMDb Score',
    title='Average Rating by Released Month',
)
fig.update_traces(
    marker_color='#636EFA',
    textposition='outside',
    texttemplate='%{text:.2f}',
    textfont_size=16,
)

# y-axis bounds come from the observed score values (sorted, so first/last are min/max)
unique_scores = sorted(df['IMDb Score'].unique())
fig.update_layout(
    template='plotly_white',
    title_x=0.5,
    title_y=0.85,
    title_font={'size': 22, 'family': 'Arial Black'},
    width=650,
    height=400,
    xaxis_title='Released Month',
    yaxis_title='Avg IMDb Score',
    xaxis_title_font={'size': 17},
    yaxis_title_font={'size': 17},
    xaxis={'tickmode': 'linear', 'tickfont': {'size': 14}},
    yaxis={
        'tickmode': 'linear',
        'tickvals': unique_scores,
        'range': [unique_scores[0] - 0.5, unique_scores[-1] + 0.5],
        'tickfont': {'size': 14},
    },
)


# Default file name offered by the "download plot" button
config = {'toImageButtonOptions': {'filename': 'Rating and Month'}}

fig.show(config=config)
In [16]:
# Mean IMDb score per release year, drawn as a bar chart
yearly_data = (
    df.groupby('released_year')['IMDb Score']
    .mean()
    .reset_index()
)
# keep the 15 most recent release years (sorted by year, newest first)
yearly_data = yearly_data.sort_values(by='released_year', ascending=False).head(15)

fig = px.bar(
    yearly_data,
    x='released_year',
    y='IMDb Score',
    text='IMDb Score',
    title='Average Rating by Released Year',
)
fig.update_traces(
    marker_color='#636EFA',
    textposition='outside',
    texttemplate='%{text:.2f}',
    textfont_size=16,
)

# y-axis bounds come from the observed score values (sorted, so first/last are min/max)
unique_scores = sorted(df['IMDb Score'].unique())
fig.update_layout(
    template='plotly_white',
    title_x=0.5,
    title_y=0.85,
    title_font={'size': 22, 'family': 'Arial Black'},
    width=650,
    height=400,
    xaxis_title='Released Year',
    yaxis_title='Avg IMDb Score',
    xaxis_title_font={'size': 17},
    yaxis_title_font={'size': 17},
    xaxis={
        'tickmode': 'linear',
        'tickfont': {'size': 14},
        'tickangle': 45,
    },
    yaxis={
        'tickmode': 'linear',
        'tickvals': unique_scores,
        'range': [unique_scores[0] - 0.5, unique_scores[-1] + 0.5],
        'tickfont': {'size': 14},
    },
)

# Default file name offered by the "download plot" button
config = {'toImageButtonOptions': {'filename': 'Rating and Year'}}

fig.show(config=config)
In [17]:
# Distinct release years (missing values excluded), oldest first
sorted_unique_years = list(df['released_year'].dropna().unique())
sorted_unique_years.sort()
print(sorted_unique_years)
[1910, 1913, 1915, 1916, 1918, 1920, 1921, 1923, 1924, 1925, 1927, 1928, 1929, 1930, 1931, 1932, 1934, 1936, 1937, 1938, 1939, 1940, 1941, 1942, 1943, 1946, 1947, 1948, 1949, 1950, 1951, 1952, 1953, 1954, 1955, 1956, 1957, 1958, 1959, 1960, 1961, 1962, 1963, 1964, 1965, 1966, 1967, 1968, 1969, 1970, 1971, 1972, 1973, 1974, 1975, 1976, 1977, 1978, 1979, 1980, 1981, 1982, 1983, 1984, 1985, 1986, 1987, 1988, 1989, 1990, 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020, 2021]
In [18]:
# Exploratory: does the day of the month of the release relate to the rating?
daily_data = (
    df.groupby('released_day')['IMDb Score']
    .mean()
    .reset_index()
)

fig = px.bar(
    daily_data,
    x='released_day',
    y='IMDb Score',
    text='IMDb Score',
    title='Average Rating by Released Day',
)
fig.update_traces(
    marker_color='#636EFA',
    textposition='outside',
    texttemplate='%{text:.2f}',
    textfont_size=16,
)

# y-axis bounds come from the observed score values (sorted, so first/last are min/max)
unique_scores = sorted(df['IMDb Score'].unique())
fig.update_layout(
    template='plotly_white',
    title_x=0.5,
    title_font={'size': 22},
    xaxis_title='Released Day',
    yaxis_title='Avg IMDb Score',
    xaxis_title_font={'size': 17},
    yaxis_title_font={'size': 17},
    xaxis={'tickmode': 'linear', 'tickfont': {'size': 14}},
    yaxis={
        'tickmode': 'linear',
        'tickvals': unique_scores,
        'range': [unique_scores[0] - 0.5, unique_scores[-1] + 0.5],
        'tickfont': {'size': 14},
    },
)

# Default file name offered by the "download plot" button
config = {'toImageButtonOptions': {'filename': 'Rating and Day'}}

fig.show(config=config)
In [19]:
# List the current columns; the released_day / released_month / released_year columns created above should appear
df.columns
Out[19]:
Index(['Title', 'Genre', 'Tags', 'Languages', 'Series or Movie', 'Runtime',
       'Director', 'Writer', 'Actors', 'Release Date', 'Summary', 'IMDb Score',
       'released_day', 'released_month', 'released_year'],
      dtype='object')

Runtime¶

In [20]:
# Drop rows where 'Runtime' or 'IMDb Score' is NaN
# NOTE(review): the Runtime plots in the next cells group `df` directly, not `df_clean` — confirm whether this frame is still needed
df_clean = df.dropna(subset=['Runtime', 'IMDb Score'])
In [21]:
# Mean IMDb score for every runtime value
runtime_data = (
    df.groupby('Runtime')['IMDb Score']
    .mean()
    .reset_index()
)

# Bar chart of the averages
fig = px.bar(
    runtime_data,
    x='Runtime',
    y='IMDb Score',
    text='IMDb Score',
    title='Average Rating by Runtime',
)


fig.update_traces(
    marker_color='#636EFA',
    textposition='outside',
    texttemplate='%{text:.2f}',
    textfont_size=16,
)

# y-axis bounds come from the observed score values (sorted, so first/last are min/max)
unique_scores = sorted(df['IMDb Score'].unique())
fig.update_layout(
    template='plotly_white',
    title_x=0.5,
    title_y=0.85,
    title_font={'size': 22, 'family': 'Arial Black'},
    width=650,
    height=400,
    xaxis_title='Runtime',
    yaxis_title='Avg IMDb Score',
    xaxis_title_font={'size': 17},
    yaxis_title_font={'size': 17},
    xaxis={
        'tickmode': 'linear',
        'tickfont': {'size': 14},
        'tickangle': 45,  # tilt the category labels
    },
    yaxis={
        'tickmode': 'linear',
        'tickvals': unique_scores,
        'range': [unique_scores[0] - 0.5, unique_scores[-1] + 0.5],
        'tickfont': {'size': 14},
    },
)

# Default file name offered by the "download plot" button
config = {'toImageButtonOptions': {'filename': 'Rating and Runtime'}}

fig.show(config=config)
In [22]:
# Mean IMDb score for every (Runtime, Series or Movie) pair
runtime_series_data = (
    df.groupby(['Runtime', 'Series or Movie'])['IMDb Score']
    .mean()
    .reset_index()
)

# Grouped bar chart, one colour per film type
fig = px.bar(
    runtime_series_data,
    x='Runtime',
    y='IMDb Score',
    color='Series or Movie',
    text='IMDb Score',
    title='Average Rating by Runtime and Series or Movie',
)

fig.update_traces(
    textposition='outside',
    texttemplate='%{text:.2f}',
    textfont_size=16,
)

# y-axis bounds come from the observed score values (sorted, so first/last are min/max)
unique_scores = sorted(df['IMDb Score'].unique())
fig.update_layout(
    template='plotly_white',
    barmode='group',  # bars side by side rather than stacked
    title_x=0.5,
    title_y=0.85,
    title_font={'size': 22, 'family': 'Arial Black'},
    width=650,
    height=500,
    xaxis_title='Runtime',
    yaxis_title='Avg IMDb Score',
    xaxis_title_font={'size': 17},
    yaxis_title_font={'size': 17},
    xaxis={
        'tickmode': 'linear',
        'tickfont': {'size': 14},
        'tickangle': 45,  # tilt the category labels
    },
    yaxis={
        'tickmode': 'linear',
        'tickvals': unique_scores,
        'range': [unique_scores[0] - 0.5, unique_scores[-1] + 0.5],
        'tickfont': {'size': 14},
    },
)

# Default file name offered by the "download plot" button
config = {'toImageButtonOptions': {'filename': 'Rating_Runtime_Series'}}

fig.show(config=config)

Series or Movie¶

In [23]:
# List the columns available before looking at 'Series or Movie'
df.columns
Out[23]:
Index(['Title', 'Genre', 'Tags', 'Languages', 'Series or Movie', 'Runtime',
       'Director', 'Writer', 'Actors', 'Release Date', 'Summary', 'IMDb Score',
       'released_day', 'released_month', 'released_year'],
      dtype='object')
In [24]:
# Mean IMDb score per film type (series vs. movie)
film_type_mean_rating = (
    df.groupby('Series or Movie')['IMDb Score']
    .mean()
    .reset_index()
)

fig = px.bar(
    film_type_mean_rating,
    x='Series or Movie',
    y='IMDb Score',
    text='IMDb Score',
    title='Average Rating by Film Type',
)
fig.update_traces(
    marker_color='#636EFA',
    textposition='outside',
    texttemplate='%{text:.2f}',
    textfont_size=16,
)

# y-axis bounds come from the observed score values (sorted, so first/last are min/max)
unique_scores = sorted(df['IMDb Score'].unique())
fig.update_layout(
    template='plotly_white',
    title_x=0.5,
    title_y=0.85,
    title_font={'size': 22, 'family': 'Arial Black'},
    width=550,
    xaxis_title='Film Type',
    yaxis_title='Avg IMDb Score',
    xaxis_title_font={'size': 17},
    yaxis_title_font={'size': 17},
    xaxis={'tickmode': 'linear', 'tickfont': {'size': 14}},
    yaxis={
        'tickmode': 'linear',
        'tickvals': unique_scores,
        'range': [unique_scores[0] - 0.5, unique_scores[-1] + 0.5],
        'tickfont': {'size': 14},
    },
)

# Default file name offered by the "download plot" button
config = {'toImageButtonOptions': {'filename': 'Rating and Film Type'}}

fig.show(config=config)
In [25]:
# Overlaid density curves of the IMDb score, one per film type, also written to ./results
sns.set_theme(style="white")
sns.kdeplot(data=df, x="IMDb Score", hue="Series or Movie", fill=True, alpha=0.4, linewidth=1.5)
# Add a title and labels to the plot using Matplotlib
plt.title("Rating Distribution by Film Type")
plt.xlabel("")
plt.ylabel("Density")
# savefig does not create missing folders, so make sure the output folder exists first
Path("./results").mkdir(parents=True, exist_ok=True)
plt.savefig("./results/Rating Distribution by Film Type.png", dpi=300)
plt.show()
In [26]:
# Box plot of the IMDb score distribution for each film type
fig = px.box(df, x="Series or Movie", y="IMDb Score", title="Box Plot of IMDb Ratings by Film's Type")
fig.update_layout(
        template='plotly_white',
        xaxis_title = "Film's Type",
        # a box plot shows the raw score distribution, not an average
        yaxis_title = 'IMDb Score',
        title_x=0.5,
        title_y=0.85,
        width=550,
        xaxis_title_font=dict(size=17),
        yaxis_title_font=dict(size=17),
        title_font=dict(size=22),
        xaxis=dict(tickfont=dict(size=14)),
        yaxis=dict(tickfont=dict(size=14)),
)
# Default file name offered by the "download plot" button
config = {
    'toImageButtonOptions': {
        'filename': 'Box Plot - Rating and Film type'
    }
}

fig.show(config=config)

Runtime¶

In [27]:
# Box plot of the IMDb score distribution for each runtime value
fig = px.box(df, x="Runtime", y="IMDb Score", title="Box Plot of IMDb Ratings by Runtime")
fig.update_layout(
        template='plotly_white',
        xaxis_title = "Film's Runtime",
        # a box plot shows the raw score distribution, not an average
        yaxis_title = 'IMDb Score',
        title_x=0.5,
        title_y=0.85,
        width=550,
        xaxis_title_font=dict(size=17),
        yaxis_title_font=dict(size=17),
        title_font=dict(size=22),
        xaxis=dict(tickfont=dict(size=14)),
        yaxis=dict(tickfont=dict(size=14)),
)
# Default file name offered by the "download plot" button
config = {
    'toImageButtonOptions': {
        'filename': 'Box Plot - Rating and Runtime'
    }
}

fig.show(config=config)

Genre¶

Plot 1: Avg Rating of most Frequent Genres¶

In [28]:
# Independent working copy that only holds rows with a known genre
genre_data = df.dropna(subset=['Genre']).copy()
In [29]:
def process_col(col):
    """Normalise a comma-separated text cell.

    Every comma-separated entry is lower-cased and stripped of surrounding
    whitespace, and the entries are re-joined with ', '. Missing values
    (as detected by ``pd.isna``) are returned unchanged.
    """
    if not pd.isna(col):
        return ', '.join(entry.lower().strip() for entry in col.split(','))
    return col
In [30]:
# Normalise the genre text (lower-case, trimmed, ', '-separated)
genre_data['Genre'] = genre_data['Genre'].map(process_col)
In [31]:
# Share (in %) of every genre combination among the rows that have a genre
genre_counts = genre_data['Genre'].value_counts()
genre_frequency = (genre_counts / len(genre_data) * 100).reset_index()

print(genre_frequency.columns)
genre_frequency.columns = ['Genre', 'Percentage']
print(genre_frequency.columns)
genre_percentage = genre_frequency.sort_values(by='Percentage', ascending=False)
# first 20 rows of the frequency table (value_counts sorts by count, highest first, by default)
top_20_genres = genre_frequency.head(20)
Index(['Genre', 'count'], dtype='object')
Index(['Genre', 'Percentage'], dtype='object')
In [32]:
# Share (in %) of the 20 most frequent genre combinations
fig = px.bar(top_20_genres, x='Genre', y='Percentage', text='Percentage', title='20 Most Frequent Genre Combinations')
fig.update_traces(
    textposition='outside',
    texttemplate='%{text:.2f}%',
    textfont_size=16,
    marker_color='#636EFA',
)
fig.update_layout(
    xaxis_title = 'Genres',
    yaxis_title = 'Percentage',
    title_x=0.5,
    title_y=0.85,
    xaxis_title_font=dict(size=17),
    yaxis_title_font=dict(size=17),
    title_font=dict(size=22),
    xaxis=dict(tickmode='linear', 
               tickfont=dict(size=14)),
    # This axis shows percentages, so it must not take its ticks/range from the
    # IMDb score values used by the rating plots; let Plotly size it and anchor it at zero.
    yaxis=dict(rangemode='tozero',
               tickfont=dict(size=14)
               ),
    template='plotly_white',
)

# Default file name offered by the "download plot" button
config = {
    'toImageButtonOptions': {
        'filename': '20 Most frequent Genre Combinations'
    }
}

fig.show(config=config)
In [33]:
# Mean IMDb score per genre combination, joined onto the frequency table
genre_mean_imdb = (
    genre_data.groupby('Genre')['IMDb Score']
    .mean()
    .reset_index()
)
print(genre_mean_imdb.columns)
genre_mean_imdb.columns = ['Genre', 'Avg IMDb Score']

genre_stats = genre_percentage.merge(genre_mean_imdb, on='Genre')
# NOTE: despite the variable name, only the first 20 rows are kept
top_30_genres = genre_stats.head(20)
Index(['Genre', 'IMDb Score'], dtype='object')
In [34]:
# Average IMDb score of the 20 most frequent genre combinations
# (top_30_genres holds 20 rows despite its name)
fig = px.bar(top_30_genres, x='Genre', y='Avg IMDb Score', text='Avg IMDb Score', title='20 Most Frequent Genre Combinations and Avg Rating')
fig.update_traces(
    textposition='outside',
    texttemplate='%{text:.2f}',
    textfont_size=16,
    marker_color='#636EFA',
)
# Compute the score bounds here, like the other rating plots do, instead of
# relying on a value left behind by whichever cell ran last
unique_scores = sorted(df['IMDb Score'].unique())
fig.update_layout(
    xaxis_title = 'Genres',
    yaxis_title = 'Avg IMDb Score',
    title_x=0.5,
    title_y=0.85,
    width=1000,
    xaxis_title_font=dict(size=17),
    yaxis_title_font=dict(size=17),
    title_font=dict(size=22, family='Arial Black'),
    xaxis=dict(tickmode='linear', 
               tickfont=dict(size=14)),
    yaxis=dict(tickmode='linear',
               tickvals=unique_scores,
               range=[min(unique_scores)-0.5, max(unique_scores)+0.5],
               tickfont=dict(size=14)
               ),
    template='plotly_white',
)

# Distinct export name so this chart is not saved under the same name as the
# percentage chart of the same genre combinations
config = {
    'toImageButtonOptions': {
        'filename': '20 Most frequent Genre Combinations - Avg Rating'
    }
}

fig.show(config=config)

Plot 2: Genres Frequency (The First Genre)¶

In [35]:
# Start again from the full frame: rows with a genre, genre text normalised
genre_data = df.dropna(subset=['Genre']).copy()
genre_data['Genre'] = genre_data['Genre'].map(process_col)
In [36]:
def save_first_val(col):
    """Return the text before the first comma of ``col`` (the whole string if it has no comma)."""
    first_entry, _, _ = col.partition(',')
    return first_entry

# Reduce every genre list to its first entry
genre_data['Genre'] = genre_data['Genre'].map(save_first_val)

# Share (in %) of every first-listed genre
genre_counts = genre_data['Genre'].value_counts()
genre_frequency = (genre_counts / len(genre_data) * 100).reset_index()

print(genre_frequency.columns)
genre_frequency.columns = ['Genre', 'Percentage']
print(genre_frequency.columns)
genre_percentage = genre_frequency.sort_values(by='Percentage', ascending=False)
Index(['Genre', 'count'], dtype='object')
Index(['Genre', 'Percentage'], dtype='object')
In [37]:
# Share (in %) of every first-listed genre.
# The data holds one genre per title at this point, so the title no longer says "Combinations".
fig = px.histogram(genre_percentage, x='Genre', y='Percentage', text_auto = True, title='Most Frequent Genres (First Listed Genre)')
fig.update_traces(
    texttemplate='%{y:.2f}%',
    textfont=dict(size=50),
    textposition='outside',
    marker_color='#636EFA',
    insidetextfont=dict(size=30),
    outsidetextfont=dict(size=30),
)
fig.update_layout(
    xaxis_title='Genres',
    yaxis_title='Percentage',
    title_x=0.5,
    xaxis_title_font=dict(size=17),
    yaxis_title_font=dict(size=17),
    title_font=dict(size=22),
    xaxis=dict(
        tickmode='linear', 
        tickfont=dict(size=14)
    ),
    yaxis=dict(
        # tickformat takes a d3-format spec, which has no '%%' escape;
        # format the number and append the percent sign through ticksuffix
        tickformat=".2f",
        ticksuffix="%",
        tickfont=dict(size=14),
    ),
    template='plotly_white',
)

# Default file name offered by the "download plot" button
config = {
    'toImageButtonOptions': {
        'filename': 'Most frequent Genre Combinations'
    }
}
fig.show(config=config)
In [38]:
# Bar chart with the share (in %) of every first-listed genre
fig = px.bar(genre_percentage, x='Genre', y='Percentage', text='Percentage', title='Genre frequencies (showed as percentage)')
fig.update_traces(
    textposition='outside',
    texttemplate='%{text:.2f}',
    textfont_size= 30,
    marker_color='#636EFA',
)

fig.update_layout(
    height = 600,
    xaxis_title = 'Genres',
    yaxis_title = 'Percentage',
    title_x=0.5,
    xaxis_title_font=dict(size=17),
    yaxis_title_font=dict(size=17),
    title_font=dict(size=22),
    xaxis=dict(tickmode='linear', 
               tickfont=dict(size=14)),
    yaxis=dict(tickmode='linear',
               tickfont=dict(size=14)
               ),
    template='plotly_white',
)

# Export name matching this chart; the previous name was copied from the
# "20 most frequent genre combinations" chart and described a different figure
config = {
    'toImageButtonOptions': {
        'filename': 'Genre Frequencies'
    }
}

fig.show(config=config)

Plot 3: Avg Rating by Genre (Only First Genre)¶

In [39]:
# Mean IMDb score per first-listed genre, joined onto the frequency table
genre_mean_imdb = (
    genre_data.groupby('Genre')['IMDb Score']
    .mean()
    .reset_index()
)
print(genre_mean_imdb.columns)
genre_mean_imdb.columns = ['Genre', 'Avg IMDb Score']
genre_stats = genre_percentage.merge(genre_mean_imdb, on='Genre')
Index(['Genre', 'IMDb Score'], dtype='object')
In [40]:
# Average IMDb score for every first-listed genre (all genres in genre_stats are plotted)
fig = px.bar(genre_stats, x='Genre', y='Avg IMDb Score', text='Avg IMDb Score', title='Avg Rating by Genre')
fig.update_traces(
    textposition='outside',
    texttemplate='%{text:.2f}',   # ratings are scores, not percentages, so no '%' suffix
    textfont_size=16,
    marker_color='#636EFA',
)
# Compute the score bounds here, like the other rating plots do, instead of
# relying on a value left behind by whichever cell ran last
unique_scores = sorted(df['IMDb Score'].unique())
fig.update_layout(
    xaxis_title='Genres',
    yaxis_title='Avg IMDb Score',
    title_x=0.5,
    title_y=0.85,
    xaxis_title_font=dict(size=17),
    yaxis_title_font=dict(size=17),
    title_font=dict(size=22, family='Arial Black'),
    xaxis=dict(
        tickmode='linear',
        tickfont=dict(size=14)
    ),
    yaxis=dict(
        tickmode='linear',
        tickvals=unique_scores,
        range=[min(unique_scores)-0.5, max(unique_scores)+0.5],
        tickfont=dict(size=14)
    ),
    template='plotly_white',
)

# Export name matching this chart; the previous name was copied from the
# "20 most frequent genre combinations" chart
config = {
    'toImageButtonOptions': {
        'filename': 'Avg Rating by Genre'
    }
}

fig.show(config=config)

Director¶

In [41]:
# Independent working copy that only holds rows with a known director
director_data = df.dropna(subset=['Director']).copy()
print(director_data['Director'].head(15))
0                               Tomas Alfredson
1                                 Coky Giedroyc
2                                 Brendan Walsh
4                                 Stephen Irwin
5                                 Mez Tharatorn
8                                   Alf Sjöberg
9                                   Lasse Åberg
10                                 Jon Holmberg
11                               David S. Goyer
12                               Hans Alfredson
13                                  Lasse Åberg
14    José Esteban Alenda, César Esteban Alenda
15                                Todd Phillips
16                                 George Lucas
17                                  David Yates
Name: Director, dtype: object
In [42]:
def pre_process_category(value, lowercase=False):
    """Keep only the first comma-separated entry of a text cell.

    Parameters
    ----------
    value : object
        Cell content. Strings are split on ',' and the first entry is returned
        with surrounding whitespace removed; any other value (e.g. NaN) is
        returned unchanged.
    lowercase : bool, default False
        When True, the returned entry is lower-cased. The previous version
        called ``value.lower()`` without using its result, which had no effect
        because strings are immutable; the default keeps that observable
        behaviour (original casing) while making normalisation available.

    Returns
    -------
    object
        The first entry as ``str`` for string input, otherwise ``value`` itself.
    """
    if not isinstance(value, str):
        return value
    first_entry = value.split(',')[0].strip()
    return first_entry.lower() if lowercase else first_entry
    
# Reduce every row to its first-listed director, then preview the result
director_data['Director'] = director_data['Director'].map(pre_process_category)
print(director_data['Director'].head(15))
0         Tomas Alfredson
1           Coky Giedroyc
2           Brendan Walsh
4           Stephen Irwin
5           Mez Tharatorn
8             Alf Sjöberg
9             Lasse Åberg
10           Jon Holmberg
11         David S. Goyer
12         Hans Alfredson
13            Lasse Åberg
14    José Esteban Alenda
15          Todd Phillips
16           George Lucas
17            David Yates
Name: Director, dtype: object
In [43]:
# Percentage of rows in director_data attributed to each (first-listed) director
director_data_freq = (
    director_data['Director'].value_counts() / len(director_data) * 100
).reset_index()
print(director_data_freq.columns)
director_data_freq.columns = ['Director', 'Percentage']
print(director_data_freq.columns)

# Order from most to least frequent and keep the 50 most frequent directors
director_data_percentage = director_data_freq.sort_values('Percentage', ascending=False)
top_50_directors = director_data_percentage.head(50)
Index(['Director', 'count'], dtype='object')
Index(['Director', 'Percentage'], dtype='object')
In [44]:
# Bar-style histogram of the share of titles for each of the top 50 directors.
# Only the first-listed director is kept per title, so the chart shows
# individual directors rather than director combinations.
fig = px.histogram(top_50_directors, x='Director', y='Percentage', text_auto = True, title='Most Frequent Directors')
fig.update_traces(
    texttemplate='%{y:.2f}%',
    textfont=dict(size=50),
    textposition='outside',
    marker_color='#636EFA',
    insidetextfont=dict(size=30),
    outsidetextfont=dict(size=30),
)
fig.update_layout(
    xaxis_title='Director',
    yaxis_title='Percentage',
    title_x=0.5,
    xaxis_title_font=dict(size=17),
    yaxis_title_font=dict(size=17),
    title_font=dict(size=22),
    xaxis=dict(
        tickmode='linear',
        tickfont=dict(size=14)
    ),
    yaxis=dict(
        # tickformat takes a d3-format specifier; the literal percent sign is
        # added through ticksuffix because the values are already in percent.
        tickformat=".2f",
        ticksuffix="%",
        tickfont=dict(size=14),
    ),
    template='plotly_white',
)

config = {
    'toImageButtonOptions': {
        # File name used by the mode bar's image download button; matches the chart title
        'filename': 'Most Frequent Directors'
    }
}
fig.show(config=config)

Director: plotting avg rating by director¶

In [45]:
# Mean IMDb score per director, joined onto the 50 most frequent directors
director_mean_imdb = director_data.groupby('Director')['IMDb Score'].mean().reset_index()
print(director_mean_imdb.columns)
director_mean_imdb = director_mean_imdb.rename(columns={'IMDb Score': 'Avg IMDb Score'})
director_stats = top_50_directors.merge(director_mean_imdb, on='Director')
Index(['Director', 'IMDb Score'], dtype='object')
In [46]:
# Bar chart of the average IMDb score for the most frequent directors (director_stats)
fig = px.bar(director_stats, x='Director', y='Avg IMDb Score', text='Avg IMDb Score', title='Avg Rating by Director')
fig.update_traces(
    textposition='outside',
    # The labels are mean IMDb scores, not percentages, so no '%' suffix
    texttemplate='%{text:.2f}',
    textfont_size=16,
    marker_color='#636EFA',
)
fig.update_layout(
    xaxis_title='Director',
    yaxis_title='Avg IMDb Score',
    title_x=0.5,
    title_y=0.85,
    width=650,
    xaxis_title_font=dict(size=17),
    yaxis_title_font=dict(size=17),
    title_font=dict(size=22, family='Arial Black'),
    xaxis=dict(
        tickmode='linear',
        tickfont=dict(size=14)
    ),
    yaxis=dict(
        # NOTE(review): Plotly documents tickvals as only taking effect with
        # tickmode='array'; with 'linear' the tickvals below are presumably
        # ignored -- confirm which tick placement is intended.
        tickmode='linear',
        tickvals=unique_scores,
        # Pad the axis by half a point beyond the lowest/highest value in unique_scores
        range=[min(unique_scores)-0.5, max(unique_scores)+0.5],
        tickfont=dict(size=14)
    ),
    template='plotly_white',
)

config = {
    'toImageButtonOptions': {
        # File name used by the mode bar's image download button; matches the chart title
        'filename': 'Avg Rating by Director'
    }
}

fig.show(config=config)

Writer¶

In [47]:
# Build an independent frame restricted to rows that have a writer, leaving df untouched
writer_data = df.dropna(subset=['Writer']).copy()
print(writer_data['Writer'].head(15))

# Keep only the first-listed writer of each row and preview the result
writer_data['Writer'] = writer_data['Writer'].map(pre_process_category)
print(writer_data['Writer'].head(15))
0                                 John Ajvide Lindqvist
1                                         Caitlin Moran
2                            Brendan Walsh, Daley Nixon
5     Pattaranad Bhiboonsawade, Thodsapon Thiptinnak...
8                                     Ivar Lo-Johansson
9                               Lasse Åberg, Bo Jonsson
10    Jon Holmberg, Daniella Mendel-Enk, Sara Young,...
11                Christine Roum, Mats Wahl, Mick Davis
12                                       Hans Alfredson
13                                          Lasse Åberg
14    José Esteban Alenda, Victoria Ruiz, César Este...
15    Scott Silver, Jerry Robinson, Todd Phillips, B...
16                                         George Lucas
17                           Steve Kloves, J.K. Rowling
19                                       Hans Alfredson
Name: Writer, dtype: object
0        John Ajvide Lindqvist
1                Caitlin Moran
2                Brendan Walsh
5     Pattaranad Bhiboonsawade
8            Ivar Lo-Johansson
9                  Lasse Åberg
10                Jon Holmberg
11              Christine Roum
12              Hans Alfredson
13                 Lasse Åberg
14         José Esteban Alenda
15                Scott Silver
16                George Lucas
17                Steve Kloves
19              Hans Alfredson
Name: Writer, dtype: object
In [48]:
# Percentage of rows in writer_data attributed to each (first-listed) writer.
# The denominator must be the writer frame itself; dividing by the director
# frame's length would skew every percentage.
writer_data_freq = writer_data['Writer'].value_counts() / len(writer_data) * 100

writer_data_freq = writer_data_freq.reset_index()
print(writer_data_freq.columns)
writer_data_freq.columns = ['Writer', 'Percentage']
print(writer_data_freq.columns)
writer_data_percentage = writer_data_freq.sort_values(by='Percentage', ascending=False)

# Save the top 50 writers
top_50_writers = writer_data_percentage.head(50)

# Mean IMDb score per writer, joined onto the 50 most frequent writers
writer_mean_imdb = writer_data.groupby('Writer')['IMDb Score'].mean().reset_index()
print(writer_mean_imdb.columns)
writer_mean_imdb.columns = ['Writer', 'Avg IMDb Score']
writer_stats = pd.merge(top_50_writers, writer_mean_imdb, on='Writer')
Index(['Writer', 'count'], dtype='object')
Index(['Writer', 'Percentage'], dtype='object')
Index(['Writer', 'IMDb Score'], dtype='object')
In [49]:
# Bar chart of the average IMDb score for the most frequent writers (writer_stats)
fig = px.bar(writer_stats, x='Writer', y='Avg IMDb Score', text='Avg IMDb Score', title='Avg Rating by Writer')
fig.update_traces(
    textposition='outside',
    # The labels are mean IMDb scores, not percentages, so no '%' suffix
    texttemplate='%{text:.2f}',
    textfont_size=16,
    marker_color='#636EFA',
)
fig.update_layout(
    xaxis_title='Writer',
    yaxis_title='Avg IMDb Score',
    title_x=0.5,
    title_y=0.85,
    width=650,
    xaxis_title_font=dict(size=17),
    yaxis_title_font=dict(size=17),
    title_font=dict(size=22, family='Arial Black'),
    xaxis=dict(
        tickmode='linear',
        tickfont=dict(size=14)
    ),
    yaxis=dict(
        # NOTE(review): Plotly documents tickvals as only taking effect with
        # tickmode='array'; with 'linear' the tickvals below are presumably
        # ignored -- confirm which tick placement is intended.
        tickmode='linear',
        tickvals=unique_scores,
        # Pad the axis by half a point beyond the lowest/highest value in unique_scores
        range=[min(unique_scores)-0.5, max(unique_scores)+0.5],
        tickfont=dict(size=14)
    ),
    template='plotly_white',
)

config = {
    'toImageButtonOptions': {
        # File name used by the mode bar's image download button; matches the chart title
        'filename': 'Avg Rating by Writer'
    }
}

fig.show(config=config)

Actors¶

In [50]:
# Display the column labels currently present in df
df.columns
Out[50]:
Index(['Title', 'Genre', 'Tags', 'Languages', 'Series or Movie', 'Runtime',
       'Director', 'Writer', 'Actors', 'Release Date', 'Summary', 'IMDb Score',
       'released_day', 'released_month', 'released_year'],
      dtype='object')
In [51]:
# Build an independent frame restricted to rows that have actors listed, leaving df untouched
actor_data = df.dropna(subset=['Actors']).copy()
print(actor_data['Actors'].head(15))

# Keep only the first-listed actor of each row and preview the result
actor_data['Actors'] = actor_data['Actors'].map(pre_process_category)
print(actor_data['Actors'].head(15))
0     Kåre Hedebrant, Per Ragnar, Lina Leandersson, ...
1     Paddy Considine, Cleo, Beanie Feldstein, Dónal...
2                     Genesis Rodriguez, Vincent Piazza
3     Vahide Perçin, Gonca Vuslateri, Cansu Dere, Be...
4                                          Ragga Gudrun
5     Thiti Mahayotaruk, Nadech Kugimiya, Kathaleeya...
6     Marcin Dorocinski, Piotr Nowak, Julia Kijowska...
7     Pawel Królikowski, Szymon Bobrowski, Danuta St...
8     Ulf Palme, Ragnar Falck, Hugo Björne, Eva Dahl...
9     Jon Skolmen, Cecilia Walton, Lasse Åberg, Eva ...
10    Elis Gerdt, Tea Stjärne, Fredrik Hallgren, Bax...
11    Marcia Gay Harden, Chris Marquette, Margarita ...
12    Stellan Skarsgård, Hans Alfredson, Per Myrberg...
13    Jon Skolmen, Ida Högberg, Lasse Åberg, Tobias ...
14    Manolo Solo, Roger Príncep, Cristina Marcos, R...
Name: Actors, dtype: object
0        Kåre Hedebrant
1       Paddy Considine
2     Genesis Rodriguez
3         Vahide Perçin
4          Ragga Gudrun
5     Thiti Mahayotaruk
6     Marcin Dorocinski
7     Pawel Królikowski
8             Ulf Palme
9           Jon Skolmen
10           Elis Gerdt
11    Marcia Gay Harden
12    Stellan Skarsgård
13          Jon Skolmen
14          Manolo Solo
Name: Actors, dtype: object
In [52]:
# Percentage of rows in actor_data attributed to each (first-listed) actor.
# The denominator must be the actor frame itself; dividing by the director
# frame's length would skew every percentage.
actor_data_freq = actor_data['Actors'].value_counts() / len(actor_data) * 100

actor_data_freq = actor_data_freq.reset_index()
print(actor_data_freq.columns)
actor_data_freq.columns = ['Actors', 'Percentage']
print(actor_data_freq.columns)
actor_data_percentage = actor_data_freq.sort_values(by='Percentage', ascending=False)

# Save the top 50 actors
top_50_actors = actor_data_percentage.head(50)

# Mean IMDb score per actor, joined onto the 50 most frequent actors
actor_mean_imdb = actor_data.groupby('Actors')['IMDb Score'].mean().reset_index()
print(actor_mean_imdb.columns)
actor_mean_imdb.columns = ['Actors', 'Avg IMDb Score']
actor_stats = pd.merge(top_50_actors, actor_mean_imdb, on='Actors')
Index(['Actors', 'count'], dtype='object')
Index(['Actors', 'Percentage'], dtype='object')
Index(['Actors', 'IMDb Score'], dtype='object')
In [53]:
# Bar chart of the average IMDb score for the most frequent actors (actor_stats)
fig = px.bar(actor_stats, x='Actors', y='Avg IMDb Score', text='Avg IMDb Score', title='Avg Rating by Actor')
fig.update_traces(
    textposition='outside',
    # The labels are mean IMDb scores, not percentages, so no '%' suffix
    texttemplate='%{text:.2f}',
    textfont_size=16,
    marker_color='#636EFA',
)
fig.update_layout(
    xaxis_title='Actors',
    yaxis_title='Avg IMDb Score',
    title_x=0.5,
    title_y=0.85,
    width=650,
    xaxis_title_font=dict(size=17),
    yaxis_title_font=dict(size=17),
    title_font=dict(size=22, family='Arial Black'),
    xaxis=dict(
        tickmode='linear',
        tickfont=dict(size=14)
    ),
    yaxis=dict(
        # NOTE(review): Plotly documents tickvals as only taking effect with
        # tickmode='array'; with 'linear' the tickvals below are presumably
        # ignored -- confirm which tick placement is intended.
        tickmode='linear',
        tickvals=unique_scores,
        # Pad the axis by half a point beyond the lowest/highest value in unique_scores
        range=[min(unique_scores)-0.5, max(unique_scores)+0.5],
        tickfont=dict(size=14)
    ),
    template='plotly_white',
)

config = {
    'toImageButtonOptions': {
        # File name used by the mode bar's image download button; matches the chart title
        'filename': 'Avg Rating by Actor'
    }
}

fig.show(config=config)
In [ ]: